#! /usr/bin/env python3

import sys
import csv
import zipfile



# Dictionary of words: each key is a stripped, lower-cased word from the
# dataset; the value is None until a word vector has been stored for it...
words = {}



# Load ner_dataset.csv and add every word to the words dictionary...
# NOTE(review): no encoding is passed to open(), so the platform default is
# used -- confirm that this matches how ner_dataset.csv was written.
with open('ner_dataset.csv', newline='') as fin:
  reader = csv.reader(fin)
  next(reader, None)  # Skip the header row; harmless if the file is empty

  for row in reader:
    # Blank or truncated rows have no second column to read the word from,
    # so skip them instead of dying with an IndexError...
    if len(row) < 2:
      continue

    words[row[1].strip().lower()] = None

print('Found {} unique words'.format(len(words)))



# Try and collect word vectors from complete glove database...
done = 0            # Number of distinct words that have received a vector
percentage = 0      # Last progress percentage written to the terminal
total = len(words)  # Loop invariant: no keys are added while scanning

with zipfile.ZipFile('glove.42B.300d.zip', 'r') as zipin:
  with zipin.open('glove.42B.300d.txt') as fin:
    for line in fin:
      # Each line is expected to look like "<token> <v1> <v2> ...\n". The
      # token is taken as everything before the first ASCII space, so a token
      # that itself contains other (Unicode) whitespace is not broken apart,
      # and blank lines or a missing final newline cannot cause a crash...
      token, _, rest = line.partition(b' ')
      word = token.decode('utf8')

      if word not in words:
        continue

      if words[word] is not None:
        print('Error: {} appears more than once'.format(word))
      else:
        # Only count a word the first time it is given a vector, so that
        # done never exceeds the number of distinct words found...
        done += 1

      # As before, a repeated word keeps the most recently read vector...
      words[word] = tuple(float(v) for v in rest.decode('utf8').split())

      # Progress can only change when done changes, so it is only checked
      # here; total is non-zero because a key of words has just matched...
      p = (done * 100) // total
      if p > percentage:
        percentage = p
        print('\r{: 2d}%'.format(percentage), end='')
        sys.stdout.flush()
    print()

print('Found {} word vectors'.format(done))
print()



# Report on missing tokens...
# The words still mapped to None are counted directly instead of being
# derived from the running done tally, so the figure stays correct even if
# that tally counted some word more than once...
missing = [word for word, state in words.items() if state is None]

if missing:
  print('Missing {} word vectors'.format(len(missing)))
  print('Example missing words:')
  limit = 10  # Maximum number of example words to list
  for word in missing[:limit]:
    print('  {}'.format(word))
  print()



# Dump the baby glove: one "<word> <v1> <v2> ..." line per word that has a
# vector, in sorted word order, into a deflate-compressed zip archive...
keys = sorted(words)

with zipfile.ZipFile('baby_glove.zip', 'w', zipfile.ZIP_DEFLATED) as zipout:
  with zipout.open('baby_glove.txt', 'w') as fout:
    for key in keys:
      vector = words[key]

      # Words that never received a vector are left out of the output...
      if vector is None:
        continue

      values = ' '.join(format(v, 'g') for v in vector)
      fout.write((key + ' ' + values + '\n').encode('utf8'))

print('Baby glove knitted')
print()
